import asyncio
import traceback

import PyPDF2


def extract_text_from_pdf_simple(pdf_path):
    """Extract plain text from a PDF without any LLM-based cleanup.

    Every page is read with PyPDF2, the page texts are combined, and the
    result is normalised by stripping each line and dropping blank ones.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text as newline-separated, stripped, non-empty lines,
        or an empty string if the file does not exist.

    Note:
        Only ``FileNotFoundError`` is handled here (the traceback goes to
        stderr and a message is printed to stdout); any other exception
        raised while opening or parsing the file propagates to the caller.
    """
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Join pages with a newline so the last line of one page is not
            # fused with the first line of the next; ``or ""`` tolerates a
            # page whose extraction yields nothing.
            text = "\n".join(
                page.extract_text() or "" for page in pdf_reader.pages
            )
    except FileNotFoundError as e:
        # print_exc() reports the exception actually being handled;
        # wrapping print_stack() in print() showed the wrong stack and
        # emitted a stray "None" line.
        traceback.print_exc()
        # NOTE(review): the message says "unknown error" although only a
        # missing file reaches this handler; text kept as-is for callers
        # or log consumers that may match on it.
        print(f"错误: 发生未知错误 - {e}")
        return ""

    # Light normalisation: strip every line and drop the empty ones.
    stripped_lines = (line.strip() for line in text.splitlines())
    return "\n".join(line for line in stripped_lines if line)


async def extract_text_from_pdf_simple_async(pdf_path):
    """Asynchronous variant of ``extract_text_from_pdf_simple``.

    The synchronous extractor performs blocking file I/O and PDF parsing,
    so it is run in the event loop's default executor instead of being
    called directly inside the coroutine, which would stall the loop.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        Whatever ``extract_text_from_pdf_simple(pdf_path)`` returns.
    """
    loop = asyncio.get_running_loop()
    # ``None`` selects the loop's default executor.
    return await loop.run_in_executor(
        None, extract_text_from_pdf_simple, pdf_path
    )